In this assignment, we try to understand whether significant information about game outcomes can be obtained from the odds data of multiple bookmakers.

R packages such as data.table, ggbiplot, ggplot2 and imager are used for this homework.

First, the data consisting of game results and odds information are prepared.

# importing data
# Both inputs are stored as RDS files; each one is read and wrapped in a
# data.table so the later `[i, j, by]` queries work on it.
read_rds_table <- function(path) {
    data.table(readRDS(path))
}
matches_rawdata <- read_rds_table("matches.rds")
odd_rawdata <- read_rds_table("odd_details.rds")

# match
# Keep only matches that already have a score. na.omit() for data.table picks
# the columns to check through `cols`; the former `score = NA` argument was
# swallowed by `...`, so rows with an NA in *any* column were being dropped.
matches_rawdata_ <- na.omit(matches_rawdata, cols = "score")
matches_rawdata_[, "match_date"] <- as.POSIXct(matches_rawdata_$date, 
    origin = "1970-01-01", tz = "GMT")
# Split "home:away" on the colon instead of by character position, so scores
# with a two-digit side (e.g. "10:0") are parsed correctly.
goal_parts <- tstrsplit(matches_rawdata_$score, ":", fixed = TRUE)
matches_rawdata_[, "number_of_goals"] <- as.numeric(goal_parts[[1]]) + 
    as.numeric(goal_parts[[2]])
matches_rawdata_[, c(2, 5, 8)]  #seems ok
# 1 when the match ended with 3 or more goals (over 2.5), 0 otherwise.
matches_rawdata_[, "over_flag"] <- as.numeric(matches_rawdata_$number_of_goals >= 
    3)

TASK 1

A-B-C)

I selected 5 bookmakers: bet365, youwin, bwin, Unibet and William Hill. Then, I created a different feature vector for each bookmaker using different odd types and applied PCA to these data. The final odds are used when generating the columns.

For bookmaker bwin:

# For every bwin market below: keep only the final (latest) quote per match
# and odd type, then reshape to one row per match with one column per odd
# type.
# rank() is called with ties.method = "first": the default "average" gives
# tied latest quotes a fractional rank (e.g. 1.5), so `row_number == 1` would
# silently drop that match / odd type. Without ties the result is unchanged.

# bookmaker1 = bwin ou2.5
bwin_ou <- odd_rawdata[bookmaker == "bwin" & betType == "ou" & 
    totalhandicap == 2.5]
bwin_ou
bwin_ou_rev <- bwin_ou[, `:=`(row_number, rank(-date, ties.method = "first")), 
    by = c("matchId", "oddtype")][row_number == 1]

bwin_ou_data <- dcast(bwin_ou_rev, matchId ~ oddtype, value.var = "odd")
colnames(bwin_ou_data) <- c("matchId", "over2.5", "under2.5")

# bts
bwin_bts <- odd_rawdata[bookmaker == "bwin" & betType == "bts"]
bwin_bts
bwin_bts_rev <- bwin_bts[, `:=`(row_number, rank(-date, ties.method = "first")), 
    by = c("matchId", "oddtype")][row_number == 1]

bwin_bts_data <- dcast(bwin_bts_rev, matchId ~ oddtype, value.var = "odd")
colnames(bwin_bts_data) <- c("matchId", "BTSNO", "BTSYES")

# ha
bwin_ha <- odd_rawdata[bookmaker == "bwin" & betType == "ha"]
bwin_ha
bwin_ha_rev <- bwin_ha[, `:=`(row_number, rank(-date, ties.method = "first")), 
    by = c("matchId", "oddtype")][row_number == 1]
bwin_ha_rev

bwin_ha_data <- dcast(bwin_ha_rev, matchId ~ oddtype, value.var = "odd")
colnames(bwin_ha_data) <- c("matchId", "ha1", "ha2")

# 1x2
bwin_1x2 <- odd_rawdata[bookmaker == "bwin" & betType == "1x2"]
bwin_1x2
bwin_1x2_rev <- bwin_1x2[, `:=`(row_number, rank(-date, ties.method = "first")), 
    by = c("matchId", "oddtype")][row_number == 1]

bwin_1x2_data <- dcast(bwin_1x2_rev, matchId ~ oddtype, value.var = "odd")

# Inner joins: only matches quoted in all four markets are kept.
merged_1 <- merge(bwin_ou_data, bwin_bts_data, by = "matchId")
merged_2 <- merge(merged_1, bwin_ha_data, by = "matchId")
merged_3 <- merge(merged_2, bwin_1x2_data, by = "matchId")

# Attach the match result; the columns are selected by name rather than by
# position so the join does not depend on the column order of the match table.
PCA_data_bwin <- merge(merged_3, matches_rawdata_[, c("matchId", "score", 
    "over_flag")], by = "matchId")
PCA_data_bwin[, "ou_status"] <- ifelse(PCA_data_bwin$over_flag == 1, 
    "over", "under")

# PCA on the nine odds columns (positions 2 to 10), centred and scaled.
PCA_bwin_result <- prcomp(PCA_data_bwin[, c(2:10)], center = TRUE, 
    scale. = TRUE)
head(PCA_data_bwin, 10)
##      matchId over2.5 under2.5 BTSNO BTSYES  ha1   ha2 odd1  odd2 oddX
##  1: 02oVDuv1    1.95     1.78  1.83   1.87 1.30  3.20 1.83  4.60 3.50
##  2: 04zko0D5    1.44     2.65  1.60   2.20 1.02 10.50 1.16 18.50 7.25
##  3: 06H610QA    1.75     2.00  2.20   1.60 1.72  2.00 2.45  2.85 3.40
##  4: 0CKxvS3A    1.44     2.65  1.83   1.87 1.04  8.50 1.20 12.00 6.50
##  5: 0Ct34Nck    1.80     1.95  2.10   1.65 1.62  2.15 2.30  3.10 3.40
##  6: 0CxMddMQ    1.62     2.20  1.95   1.75 1.13  5.00 1.45  6.75 4.33
##  7: 0Ea58aKB    1.57     2.30  2.20   1.60 3.20  1.30 4.33  1.72 4.20
##  8: 0Ek9mv4n    2.20     1.62  1.65   2.10 1.25  3.60 1.72  5.50 3.50
##  9: 0GC3r61r    2.05     1.72  1.90   1.80 1.57  2.25 2.35  3.20 3.20
## 10: 0IZhTKwl    2.35     1.55  1.72   2.00 1.60  2.20 2.35  3.30 3.10
##     score over_flag ou_status
##  1:   0:1         0     under
##  2:   4:1         1      over
##  3:   2:2         1      over
##  4:   3:0         1      over
##  5:   0:0         0     under
##  6:   0:1         0     under
##  7:   2:3         1      over
##  8:   1:0         0     under
##  9:   1:2         1      over
## 10:   2:2         1      over
PCA_bwin_result
## Standard deviations (1, .., p=9):
## [1] 2.08541465 1.56670126 1.42040750 0.30299582 0.21074767 0.15132915
## [7] 0.11806932 0.06097334 0.04643014
## 
## Rotation (n x k) = (9 x 9):
##                 PC1         PC2         PC3          PC4         PC5
## over2.5  -0.2798133  0.49011052  0.07264234 -0.799939882 -0.04260155
## under2.5  0.3566622 -0.41356673 -0.01796536 -0.404616717  0.16511145
## BTSNO    -0.1861288 -0.39289767 -0.46887359 -0.172744777 -0.73574234
## BTSYES    0.2151600  0.37675252  0.45989721  0.174476911 -0.59912068
## ha1      -0.1954218 -0.35677910  0.50705078 -0.003802246 -0.06190032
## ha2       0.4734049  0.07536622 -0.05243901  0.052221512 -0.04799745
## odd1     -0.2084655 -0.33892188  0.50999060 -0.074552966 -0.10138177
## odd2      0.4724556  0.08088614 -0.03619380 -0.087402943 -0.22879021
## oddX      0.4296334 -0.19509281  0.20445464 -0.346640426  0.04361540
##                  PC6         PC7          PC8         PC9
## over2.5   0.07993513 -0.13380367  0.009268162 -0.10060624
## under2.5 -0.59694303 -0.36482328 -0.057105209  0.13325700
## BTSNO     0.01984475  0.06301625  0.053556637 -0.11293495
## BTSYES   -0.45654497  0.04569211  0.013297138  0.01313696
## ha1       0.15706207 -0.17577641 -0.550103877 -0.46410373
## ha2       0.26974248 -0.41787100  0.433026544 -0.57230927
## odd1      0.25201813 -0.16205705  0.561965723  0.40020768
## odd2      0.50028614 -0.21008351 -0.425211922  0.48344848
## oddX      0.13119990  0.75301437  0.082891822 -0.14958368
summary(PCA_bwin_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     2.0854 1.5667 1.4204 0.3030 0.21075 0.15133 0.11807
## Proportion of Variance 0.4832 0.2727 0.2242 0.0102 0.00493 0.00254 0.00155
## Cumulative Proportion  0.4832 0.7560 0.9801 0.9903 0.99525 0.99780 0.99935
##                            PC8     PC9
## Standard deviation     0.06097 0.04643
## Proportion of Variance 0.00041 0.00024
## Cumulative Proportion  0.99976 1.00000
ggbiplot(PCA_bwin_result, ellipse = TRUE, groups = PCA_data_bwin$ou_status) + 
    ggtitle("PCA results for bwin")

PCA results of bwin are shown above. As you can see, there are 9 eigenvectors since we have 9 elements in the feature vector. In the summary table, PC1 has the largest standard deviation, followed by the other components in decreasing order. PC1 and PC2 cover about 76% of the variance. The variance covered increases to 98% when PC3 is added.

According to the plot of PCA results for bwin, the over and under results appear to be spread out randomly. The red and blue ellipses show the regions where the over and under results are mostly located. The ellipses intersect, which means that the results are not fully separated. Additionally, the arrows show which columns contribute to which principal components. In this case, oddX, odd2 and under2.5 seem to contribute mainly to PC1.

R codes for remaining bookmakers can be found in the Appendix.

For bookmaker bet365:

head(PCA_data_bet365, 10)
##      matchId over1.5 under1.5 over3.5 under3.5  ha1   ha2 odd1  odd2 oddX
##  1: 004f4ING    2.05     1.72    3.75     1.25 1.20  4.33 1.67  6.00  3.9
##  2: 02oVDuv1    1.95     1.78    3.50     1.28 1.30  3.39 1.85  4.75  3.6
##  3: 04PCiQzK    1.80     1.95    3.00     1.36 1.13  5.50 1.50  7.00  4.0
##  4: 04vrPwsg    2.25     1.60    4.00     1.22 1.44  2.63 2.05  4.10  3.4
##  5: 04zko0D5    1.44     2.65    2.10     1.66 1.02 19.00 1.14 23.00  9.0
##  6: 061xSktd    1.65     2.15    2.38     1.53 1.57  2.25 2.25  3.25  3.3
##  7: 069WnDBn    2.20     1.62    3.75     1.25 2.63  1.44 3.90  2.15  3.3
##  8: 06AqtiDs    1.62     2.20    2.50     1.50 1.13  5.50 1.44  7.00  4.5
##  9: 06H610QA    1.75     2.00    3.00     1.36 1.80  1.90 2.64  2.75  3.5
## 10: 086p9yzH    1.87     1.87    3.25     1.33 1.67  2.10 2.40  3.00  3.3
##     score over_flag ou_status
##  1:   0:1         0     under
##  2:   0:1         0     under
##  3:   3:1         1      over
##  4:   0:2         0     under
##  5:   4:1         1      over
##  6:   2:3         1      over
##  7:   0:0         0     under
##  8:   0:2         0     under
##  9:   2:2         1      over
## 10:   1:2         1      over
PCA_bet365_result
## Standard deviations (1, .., p=9):
## [1] 2.36409429 1.57593215 0.84536072 0.34335871 0.22784460 0.15313206
## [7] 0.08804866 0.07987171 0.07396927
## 
## Rotation (n x k) = (9 x 9):
##                  PC1         PC2        PC3         PC4          PC5
## over1.5  -0.36804847 -0.20936373  0.3744348  0.37321625  0.554471322
## under1.5  0.39830805  0.15847070 -0.1067110  0.50375118 -0.450039556
## over3.5  -0.36157252 -0.20695978  0.4185320  0.43186695 -0.526767464
## under3.5  0.39668999  0.15664471 -0.1186732  0.55257543  0.460150004
## ha1      -0.06613165  0.60027037  0.3242951 -0.11222475  0.002102180
## ha2       0.36051175 -0.26079045  0.3574711 -0.21406876  0.017017746
## odd1     -0.09483397  0.59439408  0.3079438 -0.08567781  0.019281535
## odd2      0.35395177 -0.28427371  0.3516468 -0.21954976  0.008025843
## oddX      0.38623299  0.03509877  0.4557645  0.03366143  0.008587737
##                   PC6          PC7         PC8         PC9
## over1.5  -0.004340672  0.031755607  0.47518691  0.08334911
## under1.5 -0.099862764  0.033025953  0.56524528  0.13410486
## over3.5  -0.105349101 -0.006673914 -0.41300847 -0.07451137
## under3.5 -0.200493480  0.013091515 -0.49170411 -0.06925729
## ha1      -0.193846313  0.473777356  0.09698721 -0.49607268
## ha2      -0.466602687 -0.512239985  0.12472895 -0.36394201
## odd1     -0.122255609 -0.432519700 -0.08185358  0.57134983
## odd2     -0.226138197  0.562474168 -0.09177761  0.49603003
## oddX      0.785705505 -0.085905022 -0.04753175 -0.11701278
summary(PCA_bet365_result)
## Importance of components:
##                          PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     2.364 1.5759 0.8454 0.3434 0.22784 0.15313 0.08805
## Proportion of Variance 0.621 0.2759 0.0794 0.0131 0.00577 0.00261 0.00086
## Cumulative Proportion  0.621 0.8969 0.9764 0.9895 0.99522 0.99782 0.99868
##                            PC8     PC9
## Standard deviation     0.07987 0.07397
## Proportion of Variance 0.00071 0.00061
## Cumulative Proportion  0.99939 1.00000
ggbiplot(PCA_bet365_result, ellipse = TRUE, groups = PCA_data_bet365$ou_status) + 
    ggtitle("PCA results for bet365")

PCA results of bet365 are shown above. As you can see, there are 9 eigenvectors since we have 9 elements in the feature vector. PC1 and PC2 cover about 90% of the variance. The variance covered increases to 98% when PC3 is added.

According to the plot of PCA results for bet365, the over and under results appear to be aggregated in the middle. Around the over3.5 region, blue points are gathered together. Also, around the under3.5, oddX and ha2 region, red points are densely placed. Again, the ellipses intersect, which means that the results are not fully separated.

For bookmaker Unibet:

head(PCA_data_Unibet, 10)
##      matchId over0.5 under0.5 odd1  odd2 oddX   12   1X   X2 score
##  1: 004f4ING    1.07      9.5 1.66  5.70 3.80 1.29 1.16 2.28   0:1
##  2: 02oVDuv1    1.07     10.5 1.85  4.80 3.65 1.30 1.21 2.04   0:1
##  3: 04PCiQzK    1.02     11.0 1.53  6.75 3.70 1.25 1.10 2.40   3:1
##  4: 04vrPwsg    1.09      8.0 2.00  4.15 3.35 1.35 1.25 1.85   0:2
##  5: 04zko0D5    1.02     25.0 1.16 23.00 8.50 1.08 1.01 5.80   4:1
##  6: 069WnDBn    1.10      8.5 3.80  2.14 3.40 1.35 1.77 1.29   0:0
##  7: 06H610QA    1.05     13.0 2.60  2.80 3.55 1.32 1.46 1.54   2:2
##  8: 086p9yzH    1.02     10.5 2.35  3.05 3.35 1.33 1.40 1.60   1:2
##  9: 08lLzk5P    1.09      9.0 3.10  2.57 3.20 1.38 1.55 1.40   1:4
## 10: 08rn2qHj    1.08      9.5 1.66  6.00 3.95 1.28 1.15 2.33   3:0
##     over_flag ou_status
##  1:         0     under
##  2:         0     under
##  3:         1      over
##  4:         0     under
##  5:         1      over
##  6:         0     under
##  7:         1      over
##  8:         1      over
##  9:         1      over
## 10:         1      over
PCA_Unibet_result
## Standard deviations (1, .., p=8):
## [1] 2.11728940 1.56625912 0.87978044 0.44580477 0.26604379 0.11828021
## [7] 0.07162311 0.03554003
## 
## Rotation (n x k) = (8 x 8):
##                 PC1        PC2         PC3         PC4         PC5
## over0.5  -0.2502421  0.2188875 -0.86833186 -0.25962542 -0.25817429
## under0.5  0.3882441 -0.2629214  0.10922977 -0.85269549 -0.08234029
## odd1     -0.1550665 -0.5885407 -0.20075687  0.13631212  0.22836491
## odd2      0.4370209  0.1906082 -0.20828380  0.20624488  0.31717799
## oddX      0.4266731 -0.2059806 -0.29587687  0.03761895  0.26350810
## 12       -0.4110453  0.2640557  0.05215001 -0.35382806  0.79263441
## 1X       -0.1579705 -0.5925083 -0.16857253  0.07781381  0.15263303
## X2        0.4423231  0.1924300 -0.17805540  0.10827108  0.22980653
##                  PC6          PC7          PC8
## over0.5   0.03666288 -0.006407559  0.004732455
## under0.5  0.18405785 -0.013391129  0.016781676
## odd1      0.39935311  0.560051192 -0.213070621
## odd2      0.56438061 -0.439283048 -0.273249370
## oddX     -0.69029413  0.028999876 -0.373871974
## 12       -0.07151930 -0.004117788 -0.001063015
## 1X       -0.06995102 -0.575699022  0.479466529
## X2        0.01600058  0.401080259  0.714116333
summary(PCA_Unibet_result)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.1173 1.5663 0.87978 0.44580 0.26604 0.11828
## Proportion of Variance 0.5604 0.3066 0.09675 0.02484 0.00885 0.00175
## Cumulative Proportion  0.5604 0.8670 0.96376 0.98860 0.99745 0.99920
##                            PC7     PC8
## Standard deviation     0.07162 0.03554
## Proportion of Variance 0.00064 0.00016
## Cumulative Proportion  0.99984 1.00000
ggbiplot(PCA_Unibet_result, ellipse = TRUE, groups = PCA_data_Unibet$ou_status) + 
    ggtitle("PCA results for Unibet")

PCA results of Unibet are shown above. As you can see, there are 8 eigenvectors since we have 8 elements in the feature vector. PC1 and PC2 cover about 87% of the variance. The variance covered increases to 96% when PC3 is added.

According to the plot of PCA results for Unibet, the over and under results appear to form a V shape at the top left corner. Around the over0.5 and 12 region, blue points are gathered together. Again, the ellipses intersect, which means that the results are not fully separated.

For bookmaker youwin:

head(PCA_data_youwin, 10)
##      matchId   12   1X   X2   NO  YES  ha1  ha2 score over_flag ou_status
##  1: 004f4ING 1.24 1.14 2.08 1.66 2.05 1.21 3.78   0:1         0     under
##  2: 04vrPwsg 1.31 1.23 1.72 1.71 1.97 1.39 2.71   0:2         0     under
##  3: 069WnDBn 1.30 1.70 1.26 1.83 1.91 2.55 1.44   0:0         0     under
##  4: 08lLzk5P 1.35 1.48 1.36 1.91 1.80 1.95 1.75   1:4         1      over
##  5: 08rn2qHj 1.25 1.13 2.25 1.75 1.95 1.20 4.00   3:0         1      over
##  6: 0EJYUPXA 1.30 1.73 1.28 1.91 1.83 2.55 1.48   3:0         1      over
##  7: 0GdvciNJ 1.25 1.25 1.70 2.35 1.50 1.45 2.55   1:0         0     under
##  8: 0GvFEVJf 1.25 1.16 2.00 2.00 1.70 1.25 3.45   1:2         1      over
##  9: 0Iej4ETg 1.25 1.18 1.95 1.75 1.95 1.25 3.35   4:0         1      over
## 10: 0IgMkoim 1.22 1.15 2.15 2.00 1.73 1.25 3.75   1:0         0     under
PCA_youwin_result
## Standard deviations (1, .., p=7):
## [1] 1.95360134 1.36303381 1.11298213 0.23015132 0.16372802 0.06687309
## [7] 0.05101876
## 
## Rotation (n x k) = (7 x 7):
##            PC1         PC2        PC3        PC4        PC5         PC6
## 12   0.2774269  0.29354001 -0.6558053 -0.3134543  0.5531890 -0.04729558
## 1X   0.3865595 -0.43426436  0.2461849 -0.1475173  0.2534862 -0.14717774
## X2  -0.4895621  0.04149191  0.2418172 -0.1926429  0.4613578  0.65601156
## NO   0.2735096  0.50198693  0.4234313 -0.6488361 -0.2682627  0.01954005
## YES -0.2892333 -0.50535997 -0.3843247 -0.6048314 -0.3809852  0.04770329
## ha1  0.3658926 -0.46471792  0.2595740 -0.1263458  0.2882602  0.11416289
## ha2 -0.4915224  0.01359525  0.2376102 -0.2002527  0.3415657 -0.72805226
##              PC7
## 12  -0.016715856
## 1X   0.702638847
## X2   0.140777667
## NO  -0.023928205
## YES  0.001897511
## ha1 -0.686069116
## ha2 -0.122204286
summary(PCA_youwin_result)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.9536 1.3630 1.1130 0.23015 0.16373 0.06687
## Proportion of Variance 0.5452 0.2654 0.1770 0.00757 0.00383 0.00064
## Cumulative Proportion  0.5452 0.8106 0.9876 0.99516 0.99899 0.99963
##                            PC7
## Standard deviation     0.05102
## Proportion of Variance 0.00037
## Cumulative Proportion  1.00000
ggbiplot(PCA_youwin_result, ellipse = TRUE, groups = PCA_data_youwin$ou_status) + 
    ggtitle("PCA results for youwin")

PCA results of youwin are shown above. As you can see, there are 7 eigenvectors since we have 7 elements in the feature vector. PC1 and PC2 cover about 81% of the variance. The variance covered increases to 99% when PC3 is added.

According to the plot of PCA results for youwin, the over and under results appear to be scattered randomly. In the middle of the plot, blue points are gathered together, while red points are generally spread out towards the corners. Again, the ellipses intersect, which means that the results are not fully separated.

For bookmaker William Hill:

head(PCA_data_WH, 10)
##      matchId odd1  odd2 oddX over4.5 under4.5  ha1  ha2 score over_flag
##  1: 004f4ING 1.62  6.00 3.60    8.00     1.06 1.20 4.33   0:1         0
##  2: 04vrPwsg 1.95  4.33 3.10    9.00     1.05 1.36 3.00   0:2         0
##  3: 061xSktd 2.40  3.20 3.00    4.75     1.14 1.67 2.10   2:3         1
##  4: 086p9yzH 2.45  3.00 3.10    7.00     1.08 1.65 2.15   1:2         1
##  5: 0ABxtTSC 2.05  3.80 3.25    6.00     1.11 1.50 2.50   0:0         0
##  6: 0CKxvS3A 1.25 11.00 6.00    3.25     1.33 1.05 9.00   3:0         1
##  7: 0CxMddMQ 1.57  7.00 3.60    5.50     1.12 1.18 4.40   0:1         0
##  8: 0GdvciNJ 2.05  4.00 3.40    5.00     1.14 1.44 2.62   1:0         0
##  9: 0GvFEVJf 1.67  5.50 3.60    5.50     1.12 1.25 3.75   1:2         1
## 10: 0I1IaJg5 1.62  6.00 3.60    5.00     1.13 1.22 4.00   0:0         0
##     ou_status
##  1:     under
##  2:     under
##  3:      over
##  4:      over
##  5:     under
##  6:      over
##  7:     under
##  8:     under
##  9:      over
## 10:     under
PCA_WH_result
## Standard deviations (1, .., p=7):
## [1] 2.06831821 1.42806041 0.71600293 0.31017601 0.22307402 0.12364345
## [7] 0.09372404
## 
## Rotation (n x k) = (7 x 7):
##                 PC1         PC2        PC3         PC4        PC5
## odd1      0.2862410 -0.54623270  0.2542926 -0.07147878  0.2370959
## odd2     -0.4632403  0.08397542  0.2895875 -0.25545311  0.5055106
## oddX     -0.4167447 -0.25239993  0.4408373  0.06431292 -0.7330653
## over4.5   0.3170424  0.40533473  0.6292778  0.57019471  0.1127070
## under4.5 -0.3791977 -0.37119230 -0.2996737  0.75696950  0.2326475
## ha1       0.2609427 -0.56964856  0.2814690 -0.10027930  0.1303574
## ha2      -0.4662623  0.06551944  0.3035081 -0.13159622  0.2589188
##                  PC6         PC7
## odd1     -0.18875516  0.67681536
## odd2      0.59979319  0.11809516
## oddX      0.12510238  0.10539976
## over4.5   0.03470033 -0.01297228
## under4.5  0.03379623 -0.01874066
## ha1       0.12983572 -0.69590108
## ha2      -0.75482806 -0.17907426
summary(PCA_WH_result)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.0683 1.4281 0.71600 0.31018 0.22307 0.12364
## Proportion of Variance 0.6111 0.2913 0.07324 0.01374 0.00711 0.00218
## Cumulative Proportion  0.6111 0.9025 0.97571 0.98945 0.99656 0.99875
##                            PC7
## Standard deviation     0.09372
## Proportion of Variance 0.00125
## Cumulative Proportion  1.00000
ggbiplot(PCA_WH_result, ellipse = TRUE, groups = PCA_data_WH$ou_status) + 
    ggtitle("PCA results for William Hill")

PCA results of William Hill are shown above. As you can see, there are 7 eigenvectors since we have 7 elements in the feature vector. PC1 alone covers about 61% of the variance, and PC1 and PC2 together cover about 90%. The variance covered increases to 98% when PC3 is added.

According to the plot of PCA results for William Hill, the over and under results appear to form a V shape at the top right corner. Again, the ellipses intersect, which means that the results are not fully separated.

We apply multidimensional scaling (MDS) now.

# MDS data are generated
# Feature tables for MDS: the odds columns of each bookmaker's PCA table,
# selected by the same column positions that were fed to prcomp().
MDS_data_bwin <- PCA_data_bwin[, c(2:10)]
MDS_data_bet365 <- PCA_data_bet365[, c(2:10)]
MDS_data_Unibet <- PCA_data_Unibet[, c(2:9)]
MDS_data_WH <- PCA_data_WH[, c(2:8)]
MDS_data_youwin <- PCA_data_youwin[, c(2:8)]

# For each bookmaker: pairwise distances between matches (Euclidean and
# Manhattan), each followed by classical MDS into two dimensions.

# bwin
MDS_euclidean_bwin <- dist(MDS_data_bwin, method = "euclidean")
mds_euclidean_bwin <- cmdscale(MDS_euclidean_bwin, k = 2)
MDS_manhattan_bwin <- dist(MDS_data_bwin, method = "manhattan")
mds_manhattan_bwin <- cmdscale(MDS_manhattan_bwin, k = 2)

# bet365
MDS_euclidean_bet365 <- dist(MDS_data_bet365, method = "euclidean")
mds_euclidean_bet365 <- cmdscale(MDS_euclidean_bet365, k = 2)
MDS_manhattan_bet365 <- dist(MDS_data_bet365, method = "manhattan")
mds_manhattan_bet365 <- cmdscale(MDS_manhattan_bet365, k = 2)

# Unibet
MDS_euclidean_Unibet <- dist(MDS_data_Unibet, method = "euclidean")
mds_euclidean_Unibet <- cmdscale(MDS_euclidean_Unibet, k = 2)
MDS_manhattan_Unibet <- dist(MDS_data_Unibet, method = "manhattan")
mds_manhattan_Unibet <- cmdscale(MDS_manhattan_Unibet, k = 2)

# William Hill
MDS_euclidean_WH <- dist(MDS_data_WH, method = "euclidean")
mds_euclidean_WH <- cmdscale(MDS_euclidean_WH, k = 2)
MDS_manhattan_WH <- dist(MDS_data_WH, method = "manhattan")
mds_manhattan_WH <- cmdscale(MDS_manhattan_WH, k = 2)

# youwin
MDS_euclidean_youwin <- dist(MDS_data_youwin, method = "euclidean")
mds_euclidean_youwin <- cmdscale(MDS_euclidean_youwin, k = 2)
MDS_manhattan_youwin <- dist(MDS_data_youwin, method = "manhattan")
mds_manhattan_youwin <- cmdscale(MDS_manhattan_youwin, k = 2)

# Scatter plot of a two-column MDS embedding, coloured by match outcome.
#   coords    two-column matrix as returned by cmdscale()
#   over_flag 0/1 vector, one entry per row of `coords` (1 = over 2.5 goals)
#   title     plot title
# The flag is turned into a factor so the colour scale is discrete; mapping
# the raw numeric 0/1 produces a continuous gradient legend for what is a
# binary label. ggplot() replaces the deprecated qplot(), and the former
# par(mfrow = c(1, 2)) call is dropped because ggplot2 draws with grid and
# ignores base-graphics layout settings.
plot_mds <- function(coords, over_flag, title) {
    plot_data <- data.frame(dim1 = coords[, 1], dim2 = coords[, 2], 
        outcome = factor(over_flag, levels = c(0, 1), labels = c("under", 
            "over")))
    ggplot(plot_data, aes(x = dim1, y = dim2, colour = outcome)) + 
        geom_point() + labs(title = title, x = "", y = "")
}

plot_mds(mds_euclidean_bwin, PCA_data_bwin$over_flag, "MDS for bwin using Euclidean Distance")

plot_mds(mds_manhattan_bwin, PCA_data_bwin$over_flag, "MDS for bwin using Manhattan Distance")

plot_mds(mds_euclidean_bet365, PCA_data_bet365$over_flag, "MDS for bet365 using Euclidean Distance")

plot_mds(mds_manhattan_bet365, PCA_data_bet365$over_flag, "MDS for bet365 using Manhattan Distance")

plot_mds(mds_euclidean_Unibet, PCA_data_Unibet$over_flag, "MDS for Unibet using Euclidean Distance")

plot_mds(mds_manhattan_Unibet, PCA_data_Unibet$over_flag, "MDS for Unibet using Manhattan Distance")

plot_mds(mds_euclidean_WH, PCA_data_WH$over_flag, "MDS for William Hill using Euclidean Distance")

plot_mds(mds_manhattan_WH, PCA_data_WH$over_flag, "MDS for William Hill using Manhattan Distance")

plot_mds(mds_euclidean_youwin, PCA_data_youwin$over_flag, "MDS for youwin using Euclidean Distance")

plot_mds(mds_manhattan_youwin, PCA_data_youwin$over_flag, "MDS for youwin using Manhattan Distance")

As you can see, the MDS plots based on Euclidean distance and on Manhattan distance have similar shapes. Sometimes the overall shape is rotated in the plot based on Manhattan distance. Also, the MDS results using Manhattan distance have more variance, so the points in the plot are spread out more. Like the PCA results, the MDS results commonly produce V-shaped graphs, so the two methods look similar in this sense. Also, as in the PCA results, the over and under results are not fully separated in the MDS plots. Therefore, we could not make a clear classification.

TASK 2

We do the same PCA analysis for the match outcomes Home, Tie and Away.

# Adds the match outcome to a bookmaker table and returns the table.
#   odds_table  data.table with a `score` column of the form "home:away"
# Columns added, in this order:
#   HomeGoals, AwayGoals  the two sides of `score` (character)
#   HTA_difference        home goals minus away goals (numeric)
#   HTA_status            "Home", "Tie" or "Away"
# This replaces five copy-pasted stanzas that labelled each row with a
# per-element sapply(); the labelling is now one vectorised lookup.
add_hta_status <- function(odds_table) {
    odds_table[, `:=`(c("HomeGoals", "AwayGoals"), tstrsplit(score, 
        ":"))]
    goal_diff <- as.numeric(odds_table$HomeGoals) - as.numeric(odds_table$AwayGoals)
    odds_table[, "HTA_difference"] <- goal_diff
    # sign() is -1 / 0 / 1, so shifting by 2 indexes Away / Tie / Home.
    odds_table[, "HTA_status"] <- c("Away", "Tie", "Home")[sign(goal_diff) + 
        2]
    odds_table
}

PCA_data_bwin <- add_hta_status(PCA_data_bwin)
PCA_data_bet365 <- add_hta_status(PCA_data_bet365)
PCA_data_Unibet <- add_hta_status(PCA_data_Unibet)
PCA_data_WH <- add_hta_status(PCA_data_WH)
PCA_data_youwin <- add_hta_status(PCA_data_youwin)

# Biplots of the PCA fits, now grouped by Home/Tie/Away outcome.
# The ggbiplot() results are combined with ggtitle(), i.e. they are ggplot
# objects, which are drawn with grid and ignore par(mfrow). The former
# par(mfrow = c(1, 2)) call therefore did nothing here and only altered the
# layout of the next base-graphics plot, so it has been removed.
ggbiplot(PCA_bwin_result, ellipse = TRUE, groups = PCA_data_bwin$HTA_status) + 
    ggtitle("PCA results for bwin")

ggbiplot(PCA_bet365_result, ellipse = TRUE, groups = PCA_data_bet365$HTA_status) + 
    ggtitle("PCA results for bet365")

ggbiplot(PCA_Unibet_result, ellipse = TRUE, groups = PCA_data_Unibet$HTA_status) + 
    ggtitle("PCA results for Unibet")

ggbiplot(PCA_WH_result, ellipse = TRUE, groups = PCA_data_WH$HTA_status) + 
    ggtitle("PCA results for William Hill")

ggbiplot(PCA_youwin_result, ellipse = TRUE, groups = PCA_data_youwin$HTA_status) + 
    ggtitle("PCA results for youwin")

The PCA results for Home/Tie/Away give a better separation than those for over/under 2.5. In particular, the plots in which the displayed components cover more of the variance separate the points more clearly.

TASK 3

This is the photo of Venice. We can also separate the photo into RGB channels.

# importing image
library(jpeg)
foto <- readJPEG("582foto.jpg")  # 512 x 512 x 3 array (see dim() below)
dim(foto)  #dimensions of the foto
## [1] 512 512   3
# display
# rasterImage() takes the x extent before the y extent, so the image width
# (ncol) belongs on x and the height (nrow) on y. The two were swapped, which
# only happened to work because this photo is square.
plot(NA, xlim = c(0, ncol(foto)), ylim = c(0, nrow(foto)), axes = FALSE, 
    xlab = "", ylab = "")
rasterImage(foto, 0, 0, ncol(foto), nrow(foto), interpolate = TRUE)

# plotting each channel
par(mfrow = c(1, 1))

# 256 intensity levels in [0, 1], used to build a single-colour ramp.
intensity <- c(0:255)/255

# Reorient one channel for image(): reverse every column, then transpose.
flip_for_image <- function(plane) {
    t(apply(plane, 2, rev))
}

image(flip_for_image(foto[, , 1]), col = rgb(intensity, 0, 0), 
    useRaster = TRUE, xlab = "red channel", axes = FALSE)

image(flip_for_image(foto[, , 2]), col = rgb(0, intensity, 0), 
    useRaster = TRUE, xlab = "green channel", axes = FALSE)

image(flip_for_image(foto[, , 3]), col = rgb(0, 0, intensity), 
    useRaster = TRUE, xlab = "blue channel", axes = FALSE)

We can add noise drawn from a uniform distribution on [0, 0.1] to each channel. Since some values in the photo matrix will then be larger than 1, I divide every value by the maximum of the matrix for normalization. Otherwise, the image could not be displayed.

# adding noise
par(mfrow = c(1, 1))
# Number of pixels per channel. The variable name `length` is kept so any
# code that reads it keeps working; calls such as length(x) still resolve to
# the base function.
length <- dim(foto)[1] * dim(foto)[2]
# Independent uniform noise on [0, 0.1] for each of the three channels.
noise1 <- array(runif(length, 0, 0.1), dim(foto[, , 1]))
noisified_foto1 <- foto[, , 1] + noise1
noise2 <- array(runif(length, 0, 0.1), dim(foto[, , 2]))
noisified_foto2 <- foto[, , 2] + noise2
noise3 <- array(runif(length, 0, 0.1), dim(foto[, , 3]))
noisified_foto3 <- foto[, , 3] + noise3

# Re-stack the channels using the photo's own dimensions rather than a
# hard-coded 512 x 512 x 3, so a photo of another size is handled as well.
noisified_foto <- array(c(noisified_foto1, noisified_foto2, noisified_foto3), 
    dim = dim(foto))
# we should normalize this since 1 is the most
norm_noisy_foto <- noisified_foto/max(noisified_foto)

# plotting
# Width (ncol) goes on x and height (nrow) on y; the two were swapped.
# `asp = 1` is an argument of the plot window, not of rasterImage(), so it is
# set in plot() where it actually fixes the aspect ratio.
plot(NA, xlim = c(0, ncol(foto)), ylim = c(0, nrow(foto)), axes = FALSE, 
    xlab = "Original Photo", ylab = "", asp = 1)
rasterImage(foto, 0, 0, ncol(foto), nrow(foto))

plot(NA, xlim = c(0, ncol(foto)), ylim = c(0, nrow(foto)), axes = FALSE, 
    xlab = "Photo with Noise", ylab = "", asp = 1)
rasterImage(norm_noisy_foto, 0, 0, ncol(foto), nrow(foto))

Then, we can display each RGB channel for noisy image.

# display each channel
par(mfrow = c(1, 1))

# 256 intensity levels in [0, 1], used to build a single-colour ramp.
intensity <- c(0:255)/255

# Reorient one channel for image(): reverse every column, then transpose.
flip_for_image <- function(plane) {
    t(apply(plane, 2, rev))
}

image(flip_for_image(norm_noisy_foto[, , 1]), col = rgb(intensity, 
    0, 0), useRaster = TRUE, axes = FALSE, xlab = "red channel with noise")

image(flip_for_image(norm_noisy_foto[, , 2]), col = rgb(0, intensity, 
    0), useRaster = TRUE, axes = FALSE, xlab = "green channel with noise")

image(flip_for_image(norm_noisy_foto[, , 3]), col = rgb(0, 0, 
    intensity), useRaster = TRUE, axes = FALSE, xlab = "blue channel with noise")

The pixel values of the three RGB channels are summed and divided by 3 for scaling purposes. You can see the grayscale photo below.

# grayscale
# Per-pixel mean of the three (noisy, normalised) colour channels.
grayscaled_foto <- (norm_noisy_foto[, , 1] + norm_noisy_foto[, 
    , 2] + norm_noisy_foto[, , 3])/3
par(mfrow = c(1, 1))
# Width (ncol) goes on the x axis and height (nrow) on the y axis; the two
# were swapped, which only worked because the photo is square.
plot(NA, xlim = c(0, ncol(grayscaled_foto)), ylim = c(0, nrow(grayscaled_foto)), 
    axes = FALSE, xlab = "", ylab = "", main = "Grayscaled Photo")
rasterImage(grayscaled_foto, 0, 0, ncol(grayscaled_foto), nrow(grayscaled_foto), 
    interpolate = TRUE)

I selected a patch size of 3, which gives 260100 submatrices ($510 \times 510$ patch positions in the $512 \times 512$ image). The submatrices are generated by the submat function. Then, each submatrix is flattened into a vector of length 9. After these steps, I apply PCA to the resulting matrix.

# generating submatrices for patches
# Returns every contiguous `nrow` x `ncol` window of the matrix `m` as an
# unnamed list of matrices. Windows are ordered with the starting row moving
# fastest: all row offsets for the first column offset, then the next column
# offset, and so on. Stops if `m` is smaller than the requested window.
submat <- function(m, nrow, ncol) {
    stopifnot(nrow(m) >= nrow, ncol(m) >= ncol)
    last_row_start <- nrow(m) - nrow + 1
    last_col_start <- ncol(m) - ncol + 1
    windows <- vector("list", last_row_start * last_col_start)
    slot <- 0
    for (col_start in seq_len(last_col_start)) {
        col_idx <- col_start:(col_start + ncol - 1)
        for (row_start in seq_len(last_row_start)) {
            slot <- slot + 1
            windows[[slot]] <- m[row_start:(row_start + nrow - 1), 
                col_idx, drop = FALSE]
        }
    }
    windows
}

M <- grayscaled_foto
patchsize <- 3
patchelements <- patchsize * patchsize
noOfpatch <- (ncol(M) - patchsize + 1) * (nrow(M) - patchsize + 
    1)
submatrices <- submat(M, patchsize, patchsize)
# One row per patch, each patch flattened column by column. vapply() builds
# a patchelements x noOfpatch matrix in one pass, which is then transposed;
# this replaces a double loop that filled the matrix one cell at a time and
# re-flattened the same patch for every one of its elements.
patchvector <- t(vapply(submatrices, as.vector, numeric(patchelements)))
# "PatchRC" = row R, column C inside the patch, in the same column-major
# order as the flattening. Derived from patchsize so the names stay in step
# if the patch size is changed.
colnames(patchvector) <- paste0("Patch", rep(seq_len(patchsize), 
    times = patchsize), rep(seq_len(patchsize), each = patchsize))
head(patchvector, 5)
##        Patch11   Patch21   Patch31   Patch12   Patch22   Patch32   Patch13
## [1,] 0.3538217 0.1553754 0.1647029 0.5033511 0.1945327 0.1427333 0.7074717
## [2,] 0.1553754 0.1647029 0.3175932 0.1945327 0.1427333 0.2573433 0.3045501
## [3,] 0.1647029 0.3175932 0.3806743 0.1427333 0.2573433 0.3669935 0.1801212
## [4,] 0.3175932 0.3806743 0.4048617 0.2573433 0.3669935 0.4605521 0.1374010
## [5,] 0.3806743 0.4048617 0.3615382 0.3669935 0.4605521 0.3329236 0.3239305
##        Patch23   Patch33
## [1,] 0.3045501 0.1801212
## [2,] 0.1801212 0.1374010
## [3,] 0.1374010 0.3239305
## [4,] 0.3239305 0.4464895
## [5,] 0.4464895 0.3273183
# apply PCA
# The patch columns are centered and scaled to unit variance before the
# decomposition (center = TRUE, scale. = TRUE).
PCA_Image <- prcomp(patchvector, center = TRUE, scale. = TRUE)
PCA_Image
## Standard deviations (1, .., p=9):
## [1] 2.8165397 0.7055225 0.4362074 0.4185938 0.2605658 0.2373746 0.1836596
## [8] 0.1688524 0.1317626
## 
## Rotation (n x k) = (9 x 9):
##               PC1           PC2          PC3        PC4         PC5
## Patch11 0.3281996  4.047501e-01 -0.378581457 -0.2457172  0.50074145
## Patch21 0.3325453  4.248949e-01  0.003906962 -0.2437822 -0.01698049
## Patch31 0.3288059  3.943078e-01  0.391929958 -0.2317763 -0.49696999
## Patch12 0.3386715  9.803015e-03 -0.447836454  0.4514724  0.03123789
## Patch22 0.3432465  9.143869e-05  0.002491892  0.4995973 -0.03179043
## Patch32 0.3386299 -9.781374e-03  0.453557855  0.4467487  0.03046569
## Patch13 0.3288540 -3.937485e-01 -0.394085008 -0.2267814 -0.49701087
## Patch23 0.3325404 -4.248400e-01 -0.007487234 -0.2437227 -0.01938444
## Patch33 0.3281319 -4.055174e-01  0.376050855 -0.2502664  0.50167752
##                PC6          PC7           PC8        PC9
## Patch11 -0.1891923  0.384907699  0.2141443643  0.2102715
## Patch21  0.4385515 -0.136988727 -0.5485530690 -0.3703253
## Patch31 -0.2397539 -0.234614875  0.3677425110  0.1853076
## Patch12 -0.2787417 -0.527353455  0.1362225208 -0.3247917
## Patch22  0.5212328 -0.001059869 -0.0008853066  0.5999016
## Patch32 -0.2786646  0.527892141 -0.1349528706 -0.3232190
## Patch13 -0.2414411  0.232915404 -0.3688273176  0.1857081
## Patch23  0.4398364  0.138689365  0.5477197134 -0.3693341
## Patch33 -0.1887127 -0.384420063 -0.2125959934  0.2087317
# Standard deviation and (cumulative) proportion of variance per component.
summary(PCA_Image)
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.8165 0.70552 0.43621 0.41859 0.26057 0.23737
## Proportion of Variance 0.8814 0.05531 0.02114 0.01947 0.00754 0.00626
## Cumulative Proportion  0.8814 0.93674 0.95788 0.97735 0.98489 0.99116
##                            PC7     PC8     PC9
## Standard deviation     0.18366 0.16885 0.13176
## Proportion of Variance 0.00375 0.00317 0.00193
## Cumulative Proportion  0.99490 0.99807 1.00000
# Biplot of the patch PCA, drawn with ggbiplot's default settings.
ggbiplot(PCA_Image)

In the summary, PC1 and PC2 together cover almost 94% of the variance, which is quite good. The standard deviation of the first component is much higher than that of the other components. The plot of the results also shows us how the points are aggregated. Now we can reconstruct the image using PC1 (and, for comparison, PC2 and PC3).

# mapping to reconstruct image
# PCA_Image$x holds the score of every patch on every principal component.
mapped_image <- PCA_Image$x
first_comp <- mapped_image[, 1]
second_comp <- mapped_image[, 2]
third_comp <- mapped_image[, 3]

# transform to matrix form
# A score image has one pixel per patch position. Its size is derived from
# the photo and the patch size instead of a hard-coded 510, so the code keeps
# working when either of them changes. Patches were generated with the row
# position varying fastest, which matches matrix()'s column-major fill.
score_rows <- nrow(M) - patchsize + 1
score_cols <- ncol(M) - patchsize + 1
# Scores are rescaled to [0, 1] so they can be drawn as gray levels.
first_mapped_matrix <- matrix(first_comp, nrow = score_rows, ncol = score_cols)
first_renorm <- renorm(first_mapped_matrix, min = 0, max = 1)
second_mapped_matrix <- matrix(second_comp, nrow = score_rows, ncol = score_cols)
second_renorm <- renorm(second_mapped_matrix, min = 0, max = 1)
third_mapped_matrix <- matrix(third_comp, nrow = score_rows, ncol = score_cols)
third_renorm <- renorm(third_mapped_matrix, min = 0, max = 1)

# One plot per component: empty axis-free canvas, then the score image on top.
par(mfrow = c(1, 1))
plot(NA, xlim = c(0, nrow(first_renorm)), ylim = c(0, ncol(first_renorm)), 
    axes = FALSE, xlab = "Reconstructing Photo with PC1", ylab = "")
rasterImage(first_renorm, 0, 0, nrow(first_renorm), ncol(first_renorm), 
    interpolate = TRUE)

plot(NA, xlim = c(0, nrow(second_renorm)), ylim = c(0, ncol(second_renorm)), 
    axes = FALSE, xlab = "Reconstructing Photo with PC2", ylab = "")
rasterImage(second_renorm, 0, 0, nrow(second_renorm), ncol(second_renorm), 
    interpolate = TRUE)

plot(NA, xlim = c(0, nrow(third_renorm)), ylim = c(0, ncol(third_renorm)), 
    axes = FALSE, xlab = "Reconstructing Photo with PC3", ylab = "")
rasterImage(third_renorm, 0, 0, nrow(third_renorm), ncol(third_renorm), 
    interpolate = TRUE)

# Reshape the loadings (eigenvectors) of the first three components back into
# patch-shaped images and rescale each one to [0, 1] for display. The image
# size comes from patchsize rather than a hard-coded 3, so it always matches
# the patches the PCA was fitted on.
PCA_components <- PCA_Image$rotation
first_comp <- PCA_components[, 1]
FirstCompImage <- matrix(first_comp, nrow = patchsize, ncol = patchsize)
FirstCompImage <- renorm(FirstCompImage, min = 0, max = 1)

second_comp <- PCA_components[, 2]
SecondCompImage <- matrix(second_comp, nrow = patchsize, ncol = patchsize)
SecondCompImage <- renorm(SecondCompImage, min = 0, max = 1)

third_comp <- PCA_components[, 3]
ThirdCompImage <- matrix(third_comp, nrow = patchsize, ncol = patchsize)
ThirdCompImage <- renorm(ThirdCompImage, min = 0, max = 1)

As you can see, the eigenvector image of the first component looks symmetric, which reflects that the pixel values are weighted almost equally. For PC2, the colours turn to black as we go from left to right. For PC3, the colours turn to black as we go from bottom to top. These images basically show how the weights are spread within each eigenvector.

# Draw each eigenvector image on its own axis-free canvas. The image extent is
# taken from the image's own dimensions (the same values used for the plot
# limits) instead of a hard-coded 3, so it always fills the canvas.
# Interpolation is off so the individual patch cells stay visible.
plot(NA, xlim = c(0, nrow(FirstCompImage)), ylim = c(0, ncol(FirstCompImage)), 
    axes = FALSE, xlab = "Eigenvector 1", ylab = "")
rasterImage(FirstCompImage, 0, 0, nrow(FirstCompImage), ncol(FirstCompImage), 
    interpolate = FALSE)

plot(NA, xlim = c(0, nrow(SecondCompImage)), ylim = c(0, ncol(SecondCompImage)), 
    axes = FALSE, xlab = "Eigenvector 2", ylab = "")
rasterImage(SecondCompImage, 0, 0, nrow(SecondCompImage), ncol(SecondCompImage), 
    interpolate = FALSE)

plot(NA, xlim = c(0, nrow(ThirdCompImage)), ylim = c(0, ncol(ThirdCompImage)), 
    axes = FALSE, xlab = "Eigenvector 3", ylab = "")
rasterImage(ThirdCompImage, 0, 0, nrow(ThirdCompImage), ncol(ThirdCompImage), 
    interpolate = FALSE)

APPENDIX

# bookmaker2 = bet365 ou1.5
# Every odd type below follows the same recipe: filter the bookmaker and bet
# type, keep the row ranked first by descending date within each
# (matchId, oddtype) group (the final odd), and reshape to one row per match
# with one column per odd type.
bet365_ou <- odd_rawdata[bookmaker == "bet365" & betType == "ou" & 
    totalhandicap == 1.5]
bet365_ou
bet365_ou_rev <- bet365_ou[, `:=`(row_number, rank(-date)), by = c("matchId", 
    "oddtype")][row_number == 1]

# Fixed: reshape the bet365 rows (bet365_ou_rev). The previous version
# reshaped bwin_ou_rev here, so the "over1.5"/"under1.5" columns actually
# held bwin's over/under 2.5 odds.
bet365_ou_data <- dcast(bet365_ou_rev, matchId ~ oddtype, value.var = "odd")
colnames(bet365_ou_data) <- c("matchId", "over1.5", "under1.5")
# ou3.5
bet365_ou_3 <- odd_rawdata[bookmaker == "bet365" & betType == 
    "ou" & totalhandicap == 3.5]
bet365_ou_3
bet365_ou3_rev <- bet365_ou_3[, `:=`(row_number, rank(-date)), 
    by = c("matchId", "oddtype")][row_number == 1]

bet365_ou3_data <- dcast(bet365_ou3_rev, matchId ~ oddtype, value.var = "odd")
colnames(bet365_ou3_data) <- c("matchId", "over3.5", "under3.5")

# ha2
bet365_ha2 <- odd_rawdata[bookmaker == "bet365" & betType == "ha"]
bet365_ha2
bet365_ha2_rev <- bet365_ha2[, `:=`(row_number, rank(-date)), 
    by = c("matchId", "oddtype")][row_number == 1]
bet365_ha2_rev

bet365_ha2_data <- dcast(bet365_ha2_rev, matchId ~ oddtype, value.var = "odd")
colnames(bet365_ha2_data) <- c("matchId", "ha1", "ha2")

# 1x2
bet365_1x2 <- odd_rawdata[bookmaker == "bet365" & betType == "1x2"]
bet365_1x2
bet365_1x2_rev <- bet365_1x2[, `:=`(row_number, rank(-date)), 
    by = c("matchId", "oddtype")][row_number == 1]

bet365_1x2_data <- dcast(bet365_1x2_rev, matchId ~ oddtype, value.var = "odd")

# Combine all bet365 feature tables; merge() keeps only matches present in
# every table.
merged_1 <- merge(bet365_ou_data, bet365_ou3_data, by = "matchId")
merged_2 <- merge(merged_1, bet365_ha2_data, by = "matchId")
merged_3 <- merge(merged_2, bet365_1x2_data, by = "matchId")

# Attach columns 2, 5 and 10 of the match table (selected by position; they
# include matchId and over_flag, which are used below).
PCA_data_bet365 <- merge(merged_3, matches_rawdata_[, c(2, 5, 
    10)], by = "matchId")
# Label each match for plotting; over_flag is 1 (over) or 0 (under).
PCA_data_bet365[, "ou_status"] <- ifelse(PCA_data_bet365$over_flag == 
    1, "over", "under")

# Columns 2:10 are the nine odd columns (2 ou1.5 + 2 ou3.5 + 2 ha + 3 1x2).
PCA_bet365_result <- prcomp(PCA_data_bet365[, c(2:10)], center = TRUE, 
    scale. = TRUE)
head(PCA_data_bet365, 10)
PCA_bet365_result
summary(PCA_bet365_result)
ggbiplot(PCA_bet365_result, ellipse = TRUE, groups = PCA_data_bet365$ou_status) + 
    ggtitle("PCA results for bet365")
# bookmaker3 = Unibet 1x2
# For each bet type: filter the rows, keep the row ranked first by descending
# date per (matchId, oddtype), then reshape to one row per match.
Unibet_1x2 <- odd_rawdata[bookmaker == "Unibet" & betType == "1x2"]
Unibet_1x2
Unibet_1x2_rev <- Unibet_1x2[
    , row_number := rank(-date),
    by = c("matchId", "oddtype")
][row_number == 1]

Unibet_1x2_data <- dcast(
    Unibet_1x2_rev,
    matchId ~ oddtype,
    value.var = "odd"
)
# ou0.5
Unibet_ou <- odd_rawdata[
    bookmaker == "Unibet" & betType == "ou" & totalhandicap == 0.5
]
Unibet_ou
Unibet_ou_rev <- Unibet_ou[
    , row_number := rank(-date),
    by = c("matchId", "oddtype")
][row_number == 1]

Unibet_ou_data <- dcast(
    Unibet_ou_rev,
    matchId ~ oddtype,
    value.var = "odd"
)
colnames(Unibet_ou_data) <- c("matchId", "over0.5", "under0.5")

# dc
Unibet_dc <- odd_rawdata[bookmaker == "Unibet" & betType == "dc"]
Unibet_dc
Unibet_dc_rev <- Unibet_dc[
    , row_number := rank(-date),
    by = c("matchId", "oddtype")
][row_number == 1]

Unibet_dc_data <- dcast(
    Unibet_dc_rev,
    matchId ~ oddtype,
    value.var = "odd"
)

# Join the Unibet feature tables on matchId, then attach columns 2, 5 and 10
# of the match table (selected by position).
merged_1 <- merge(Unibet_ou_data, Unibet_1x2_data, by = "matchId")
merged_2 <- merge(merged_1, Unibet_dc_data, by = "matchId")

PCA_data_Unibet <- merge(
    merged_2,
    matches_rawdata_[, c(2, 5, 10)],
    by = "matchId"
)
# Text label of the over/under flag, used to colour the biplot.
PCA_data_Unibet[, "ou_status"] <- sapply(
    PCA_data_Unibet$over_flag,
    function(flag) {
        if (flag == 1) {
            "over"
        } else if (flag == 0) {
            "under"
        }
    }
)

# Columns 2:9 are the eight odd columns (2 ou0.5 + 3 1x2 + 3 dc).
PCA_Unibet_result <- prcomp(
    PCA_data_Unibet[, 2:9],
    center = TRUE,
    scale. = TRUE
)
head(PCA_data_Unibet, 10)
PCA_Unibet_result
summary(PCA_Unibet_result)
ggbiplot(
    PCA_Unibet_result,
    ellipse = TRUE,
    groups = PCA_data_Unibet$ou_status
) +
    ggtitle("PCA results for Unibet")
# bookmaker4 = youwin dc1x2
# For each bet type: filter the rows, keep the row ranked first by descending
# date per (matchId, oddtype), then reshape to one row per match.
youwin_dc <- odd_rawdata[bookmaker == "youwin" & betType == "dc"]
youwin_dc
youwin_dc_rev <- youwin_dc[
    , row_number := rank(-date),
    by = c("matchId", "oddtype")
][row_number == 1]

youwin_dc_data <- dcast(
    youwin_dc_rev,
    matchId ~ oddtype,
    value.var = "odd"
)
# btsYESNO
youwin_bts <- odd_rawdata[bookmaker == "youwin" & betType == "bts"]
youwin_bts
youwin_bts_rev <- youwin_bts[
    , row_number := rank(-date),
    by = c("matchId", "oddtype")
][row_number == 1]

youwin_bts_data <- dcast(
    youwin_bts_rev,
    matchId ~ oddtype,
    value.var = "odd"
)

# ha
youwin_ha <- odd_rawdata[bookmaker == "youwin" & betType == "ha"]
youwin_ha
youwin_ha_rev <- youwin_ha[
    , row_number := rank(-date),
    by = c("matchId", "oddtype")
][row_number == 1]

youwin_ha_data <- dcast(
    youwin_ha_rev,
    matchId ~ oddtype,
    value.var = "odd"
)
colnames(youwin_ha_data) <- c("matchId", "ha1", "ha2")

# Join the youwin feature tables on matchId, then attach columns 2, 5 and 10
# of the match table (selected by position).
merged_1 <- merge(youwin_dc_data, youwin_bts_data, by = "matchId")
merged_2 <- merge(merged_1, youwin_ha_data, by = "matchId")

PCA_data_youwin <- merge(
    merged_2,
    matches_rawdata_[, c(2, 5, 10)],
    by = "matchId"
)
# Text label of the over/under flag, used to colour the biplot.
PCA_data_youwin[, "ou_status"] <- sapply(
    PCA_data_youwin$over_flag,
    function(flag) {
        if (flag == 1) {
            "over"
        } else if (flag == 0) {
            "under"
        }
    }
)

# Columns 2:8 are the seven odd columns (3 dc + 2 bts + 2 ha).
PCA_youwin_result <- prcomp(
    PCA_data_youwin[, 2:8],
    center = TRUE,
    scale. = TRUE
)
head(PCA_data_youwin, 10)
PCA_youwin_result
summary(PCA_youwin_result)
ggbiplot(
    PCA_youwin_result,
    ellipse = TRUE,
    groups = PCA_data_youwin$ou_status
) +
    ggtitle("PCA results for youwin")
# bookmaker5 = WH
# For each bet type: filter the rows, keep the row ranked first by descending
# date per (matchId, oddtype), then reshape to one row per match.
WH_odd2 <- odd_rawdata[bookmaker == "William Hill" & betType == "1x2"]
WH_odd2
WH_odd_rev <- WH_odd2[
    , row_number := rank(-date),
    by = c("matchId", "oddtype")
][row_number == 1]

WH_odd_data <- dcast(WH_odd_rev, matchId ~ oddtype, value.var = "odd")
# for over4.5
WH_ou <- odd_rawdata[
    bookmaker == "William Hill" & betType == "ou" & totalhandicap == 4.5
]
WH_ou
WH_ou_rev <- WH_ou[
    , row_number := rank(-date),
    by = c("matchId", "oddtype")
][row_number == 1]

WH_ou_data <- dcast(WH_ou_rev, matchId ~ oddtype, value.var = "odd")
colnames(WH_ou_data) <- c("matchId", "over4.5", "under4.5")

# ha12
WH_ha1 <- odd_rawdata[bookmaker == "William Hill" & betType == "ha"]
WH_ha1
WH_ha1_rev <- WH_ha1[
    , row_number := rank(-date),
    by = c("matchId", "oddtype")
][row_number == 1]
WH_ha1_rev

WH_ha1_data <- dcast(WH_ha1_rev, matchId ~ oddtype, value.var = "odd")
colnames(WH_ha1_data) <- c("matchId", "ha1", "ha2")

# Join the William Hill feature tables on matchId, then attach columns 2, 5
# and 10 of the match table (selected by position).
merged_1 <- merge(WH_odd_data, WH_ou_data, by = "matchId")
merged_2 <- merge(merged_1, WH_ha1_data, by = "matchId")

PCA_data_WH <- merge(
    merged_2,
    matches_rawdata_[, c(2, 5, 10)],
    by = "matchId"
)
# Text label of the over/under flag, used to colour the biplot.
PCA_data_WH[, "ou_status"] <- sapply(
    PCA_data_WH$over_flag,
    function(flag) {
        if (flag == 1) {
            "over"
        } else if (flag == 0) {
            "under"
        }
    }
)

# Columns 2:8 are the seven odd columns (3 1x2 + 2 ou4.5 + 2 ha).
PCA_WH_result <- prcomp(
    PCA_data_WH[, 2:8],
    center = TRUE,
    scale. = TRUE
)
head(PCA_data_WH, 10)
PCA_WH_result
summary(PCA_WH_result)
ggbiplot(
    PCA_WH_result,
    ellipse = TRUE,
    groups = PCA_data_WH$ou_status
) +
    ggtitle("PCA results for William Hill")